In [5]:
import pandas as pd

pd.options.display.max_columns = 999


train= pd.read_csv('C:/Users/mosto/REPOSITORY/Prediction_of_Insurance_Premium/data/Train.csv', index_col=False)
test= pd.read_csv('C:/Users/mosto/REPOSITORY/Prediction_of_Insurance_Premium/data/Test.csv', index_col=False)
val= pd.read_csv('C:/Users/mosto/REPOSITORY/Prediction_of_Insurance_Premium/data/Validation.csv', index_col=False)

print(train.shape)
print(test.shape)
print(val.shape)
(6210, 54)
(1827, 54)
(1097, 54)
In [4]:
train.head()
Out[4]:
Washington Arizona Nevada California Oregon Customer Lifetime Value Response Coverage Education Employed Unemployed Medical_Leave Disabled Retired Female Male Income Suburban Rural Urban Married Single Divorced Monthly Premium Auto Months Since Last Claim Months Since Policy Inception Number of Open Complaints Number of Policies Corporate Auto Personal Auto Special Auto Corporate L3 Personal L3 Corporate L2 Personal L1 Special L2 Corporate L1 Personal L2 Special L1 Special L3 Renew Offer Type Agent Call Center Web Branch Total Claim Amount Two Door Four Door SUV Luxury SUV Sports Car Luxury Car Vehicle Size Activation_date
0 0 0 0 1 0 7659.72 No 1 1 0 1 0 0 0 0 1 0 1 0 0 0 1 0 73 22 93 0 2 0 1 0 0 0 0 0 0 0 1 0 0 Offer1 1 0 0 0 525.60 1 0 0 0 0 0 1 2011-02-06
1 0 0 0 1 0 4836.75 No 2 2 0 1 0 0 0 1 0 0 1 0 0 0 1 0 136 8 2 0 1 0 1 0 0 0 0 0 0 0 1 0 0 Offer1 0 0 0 1 979.20 0 0 1 0 0 0 1 2011-01-03
2 0 0 0 0 1 2648.47 No 1 1 0 0 0 1 0 1 0 26540 1 0 0 1 0 0 69 15 4 0 1 0 1 0 0 0 0 0 0 0 1 0 0 Offer2 0 0 0 1 378.43 0 1 0 0 0 0 2 2011-01-22
3 0 1 0 0 0 13575.68 No 1 2 1 0 0 0 0 1 0 48534 1 0 0 1 0 0 115 29 79 1 2 0 1 0 0 1 0 0 0 0 0 0 0 Offer4 0 1 0 0 552.00 0 0 1 0 0 0 1 2011-01-23
4 0 0 1 0 0 3494.15 No 2 1 1 0 0 0 0 0 1 35001 1 0 0 0 1 0 96 18 87 3 1 0 1 0 0 0 0 0 0 0 1 0 0 Offer1 0 0 1 0 460.80 0 1 0 0 0 0 1 2011-02-17
In [6]:
import plotly.express as px
In [7]:
px.scatter(train, 
            x = "Months Since Last Claim", 
            y = "Monthly Premium Auto", 
            trendline="ols")

# Explore relationship between month since last claim and insurance premium

# if month since last claim is 0, that means you recently had a claim
# if month since last claim is 35, that means you haven't had a clain in 35 months. 
In [8]:
px.scatter(train, 
            x = "Months Since Last Claim", 
            y = "Customer Lifetime Value", 
            trendline="ols")
In [9]:
px.scatter(train, 
            x="Customer Lifetime Value", 
            y="Monthly Premium Auto", 
            trendline="ols",
            color="Monthly Premium Auto")
In [10]:
# a sample of the data. 
train_sample = train.sample(n=100, random_state=42)
train_sample.shape
Out[10]:
(100, 54)
In [11]:
px.scatter_3d(train_sample, 
            x="Months Since Last Claim",
            y="Customer Lifetime Value", 
            z="Monthly Premium Auto",
            color="Months Since Last Claim")
In [12]:
train_sample.columns.to_list()
Out[12]:
['Washington',
 'Arizona',
 'Nevada',
 'California',
 'Oregon',
 'Customer Lifetime Value',
 'Response',
 'Coverage',
 'Education',
 'Employed',
 'Unemployed',
 'Medical_Leave',
 'Disabled',
 'Retired',
 'Female',
 'Male',
 'Income',
 'Suburban',
 'Rural',
 'Urban',
 'Married',
 'Single',
 'Divorced',
 'Monthly Premium Auto',
 'Months Since Last Claim',
 'Months Since Policy Inception',
 'Number of Open Complaints',
 'Number of Policies',
 'Corporate Auto',
 'Personal Auto',
 'Special Auto',
 'Corporate L3',
 'Personal L3',
 'Corporate L2',
 'Personal L1',
 'Special L2',
 'Corporate L1',
 'Personal L2',
 'Special L1',
 'Special L3',
 'Renew Offer Type',
 'Agent',
 'Call Center',
 'Web',
 'Branch',
 'Total Claim Amount',
 'Two Door',
 'Four Door',
 'SUV',
 'Luxury SUV',
 'Sports Car',
 'Luxury Car',
 'Vehicle Size',
 'Activation_date']
In [13]:
# Look at relationship between month since policy inception, and premium price

px.scatter(train_sample, x="Months Since Policy Inception", y="Monthly Premium Auto")
In [15]:
# Look at relationship between month since policy inception, and customer lifetime value

px.scatter(train_sample, x="Months Since Policy Inception", y="Customer Lifetime Value")
In [16]:
import seaborn as sns
In [17]:
# using seaborn to see from a high level perspective what the relationship between multiple variables. 

sns.pairplot(train_sample)
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-17-90468ae1dc79> in <module>
      1 # using seaborn to see from a high level perspective what the relationship between multiple variables.
      2 
----> 3 sns.pairplot(train_sample)

~\anaconda3\envs\RTX-2080\lib\site-packages\seaborn\_decorators.py in inner_f(*args, **kwargs)
     44             )
     45         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 46         return f(**kwargs)
     47     return inner_f
     48 

~\anaconda3\envs\RTX-2080\lib\site-packages\seaborn\axisgrid.py in pairplot(data, hue, hue_order, palette, vars, x_vars, y_vars, kind, diag_kind, markers, height, aspect, corner, dropna, plot_kws, diag_kws, grid_kws, size)
   1972     if kind == "scatter":
   1973         from .relational import scatterplot  # Avoid circular import
-> 1974         plotter(scatterplot, **plot_kws)
   1975     elif kind == "reg":
   1976         from .regression import regplot  # Avoid circular import

~\anaconda3\envs\RTX-2080\lib\site-packages\seaborn\axisgrid.py in map_offdiag(self, func, **kwargs)
   1296         self.map_lower(func, **kwargs)
   1297         if not self._corner:
-> 1298             self.map_upper(func, **kwargs)
   1299         return self
   1300 

~\anaconda3\envs\RTX-2080\lib\site-packages\seaborn\axisgrid.py in map_upper(self, func, **kwargs)
   1279         """
   1280         indices = zip(*np.triu_indices_from(self.axes, 1))
-> 1281         self._map_bivariate(func, indices, **kwargs)
   1282         return self
   1283 

~\anaconda3\envs\RTX-2080\lib\site-packages\seaborn\axisgrid.py in _map_bivariate(self, func, indices, **kwargs)
   1432             y_var = self.y_vars[i]
   1433             ax = self.axes[i, j]
-> 1434             self._plot_bivariate(x_var, y_var, ax, func, **kws)
   1435         self._add_axis_labels()
   1436 

~\anaconda3\envs\RTX-2080\lib\site-packages\seaborn\axisgrid.py in _plot_bivariate(self, x_var, y_var, ax, func, **kwargs)
   1444             return
   1445 
-> 1446         plt.sca(ax)
   1447         kwargs = kwargs.copy()
   1448 

~\anaconda3\envs\RTX-2080\lib\site-packages\matplotlib\pyplot.py in sca(ax)
    967         raise ValueError("Axes parent figure is not managed by pyplot")
    968     _pylab_helpers.Gcf.set_active(ax.figure.canvas.manager)
--> 969     ax.figure.sca(ax)
    970 
    971 

~\anaconda3\envs\RTX-2080\lib\site-packages\matplotlib\figure.py in sca(self, a)
   2067     def sca(self, a):
   2068         """Set the current axes to be *a* and return *a*."""
-> 2069         self._axstack.bubble(a)
   2070         self._axobservers.process("_axes_change_event", self)
   2071         return a

~\anaconda3\envs\RTX-2080\lib\site-packages\matplotlib\figure.py in bubble(self, a)
    102         stack, to the top.
    103         """
--> 104         return super().bubble(self._entry_from_axes(a))
    105 
    106     def add(self, key, a):

~\anaconda3\envs\RTX-2080\lib\site-packages\matplotlib\cbook\__init__.py in bubble(self, o)
    626                 top_elements.append(elem)
    627             else:
--> 628                 self.push(elem)
    629         for _ in top_elements:
    630             self.push(o)

~\anaconda3\envs\RTX-2080\lib\site-packages\matplotlib\cbook\__init__.py in push(self, o)
    584         *o* is returned.
    585         """
--> 586         self._elements = self._elements[:self._pos + 1] + [o]
    587         self._pos = len(self._elements) - 1
    588         return self()

KeyboardInterrupt: 
In [18]:
a = sns.pairplot(train_sample, vars=['Customer Lifetime Value', 
                                "Coverage", 
                                "Education", 
                                "Income",
                                "Monthly Premium Auto", 
                                'Months Since Last Claim', 
                                "Months Since Policy Inception", 
                                "Number of Open Complaints",
                                "Number of Policies", 
                                "Total Claim Amount",
                                "Vehicle Size"])
In [19]:
b = sns.pairplot(train_sample, vars=['Customer Lifetime Value',
                                "Income",
                                "Monthly Premium Auto", 
                                'Months Since Last Claim', 
                                "Months Since Policy Inception", 
                                "Total Claim Amount"])
In [20]:
# look at total claim amount and monthly premium auto

px.scatter(train_sample, x='Monthly Premium Auto', y='Total Claim Amount')
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: